--- linux-2.6.32/drivers/infiniband/Kconfig~	2009-12-05 00:26:03.663774916 +0100
+++ linux-2.6.32/drivers/infiniband/Kconfig	2009-12-05 00:26:05.914179759 +0100
@@ -37,7 +37,6 @@
 config INFINIBAND_ADDR_TRANS
 	bool
 	depends on INET
-	depends on !(INFINIBAND = y && IPV6 = m)
 	default y

 source "drivers/infiniband/hw/mthca/Kconfig"

--- linux-2.6.33/scripts/mod/modpost.c~	2010-02-24 19:52:17.000000000 +0100
+++ linux-2.6.33/scripts/mod/modpost.c	2010-03-07 14:26:47.242168558 +0100
@@ -15,7 +15,8 @@
 #include
 #include
 #include "modpost.h"
-#include "../../include/generated/autoconf.h"
+// PLD architectures don't use CONFIG_SYMBOL_PREFIX
+//#include "../../include/generated/autoconf.h"
 #include "../../include/linux/license.h"

 /* Some toolchains use a `_' prefix for all user symbols. */

commit 87b09f1f25cd1e01d7c50bf423c7fe33027d7511
Author: stephen hemminger
Date:   Fri Feb 12 06:58:00 2010 +0000

    sky2: don't enable PME legacy mode

    This bit is not changed by the vendor driver, and should be left
    alone. The documentation implies this is a debug bit:
      0 = WAKE# only asserted when VMAIN not available
      1 = WAKE# depends on wake events and is independent of VMAIN.

    Signed-off-by: Stephen Hemminger
    Signed-off-by: David S. Miller

diff --git b/drivers/net/sky2.c a/drivers/net/sky2.c
index 2494842..edf37aa 100644
--- b/drivers/net/sky2.c
+++ a/drivers/net/sky2.c
@@ -733,6 +733,7 @@ static void sky2_wol_init(struct sky2_port *sky2)
 	unsigned port = sky2->port;
 	enum flow_control save_mode;
 	u16 ctrl;
+	u32 reg1;

 	/* Bring hardware out of reset */
 	sky2_write16(hw, B0_CTST, CS_RST_CLR);
@@ -786,6 +787,11 @@ static void sky2_wol_init(struct sky2_port *sky2)
 	/* Disable PiG firmware */
 	sky2_write16(hw, B0_CTST, Y2_HW_WOL_OFF);

+	/* Turn on legacy PCI-Express PME mode */
+	reg1 = sky2_pci_read32(hw, PCI_DEV_REG1);
+	reg1 |= PCI_Y2_PME_LEGACY;
+	sky2_pci_write32(hw, PCI_DEV_REG1, reg1);
+
 	/* block receiver */
 	sky2_write8(hw, SK_REG(port, RX_GMF_CTRL_T), GMF_RST_SET);
 }

On Sat, 2 Jul 2011, Andi Kleen wrote:

> > The problem is that blk_peek_request() calls scsi_prep_fn(), which
> > does this:
> >
> >	struct scsi_device *sdev = q->queuedata;
> >	int ret = BLKPREP_KILL;
> >
> >	if (req->cmd_type == REQ_TYPE_BLOCK_PC)
> >		ret = scsi_setup_blk_pc_cmnd(sdev, req);
> >	return scsi_prep_return(q, req, ret);
> >
> > It doesn't check to see if sdev is NULL, nor does
> > scsi_setup_blk_pc_cmnd(). That accounts for this error:
>
> I actually added a NULL check in scsi_setup_blk_pc_cmnd early on,
> but that just caused RCU CPU stalls afterwards and then eventually
> a hung system.

The RCU problem is likely to be a separate issue. It might even be a
result of the use-after-free problem with the elevator.

At any rate, it's clear that the crash in the refcounting log you
posted occurred because scsi_setup_blk_pc_cmnd() called
scsi_prep_state_check(), which tried to dereference the NULL pointer.

Would you like to try this patch to see if it fixes the problem? As I
said before, I'm not certain it's the best thing to do, but it worked
on my system.

Alan Stern
Index: usb-3.0/drivers/scsi/scsi_lib.c
===================================================================
--- usb-3.0.orig/drivers/scsi/scsi_lib.c
+++ usb-3.0/drivers/scsi/scsi_lib.c
@@ -1247,6 +1247,8 @@ int scsi_prep_fn(struct request_queue *q
 	struct scsi_device *sdev = q->queuedata;
 	int ret = BLKPREP_KILL;

+	if (!sdev)
+		return ret;
 	if (req->cmd_type == REQ_TYPE_BLOCK_PC)
 		ret = scsi_setup_blk_pc_cmnd(sdev, req);
 	return scsi_prep_return(q, req, ret);
Index: usb-3.0/drivers/scsi/scsi_sysfs.c
===================================================================
--- usb-3.0.orig/drivers/scsi/scsi_sysfs.c
+++ usb-3.0/drivers/scsi/scsi_sysfs.c
@@ -322,6 +322,8 @@ static void scsi_device_dev_release_user
 		kfree(evt);
 	}

+	/* Freeing the queue signals to block that we're done */
+	scsi_free_queue(sdev->request_queue);
 	blk_put_queue(sdev->request_queue);
 	/* NULL queue means the device can't be used */
 	sdev->request_queue = NULL;
@@ -936,8 +938,6 @@ void __scsi_remove_device(struct scsi_de
 	/* cause the request function to reject all I/O requests */
 	sdev->request_queue->queuedata = NULL;

-	/* Freeing the queue signals to block that we're done */
-	scsi_free_queue(sdev->request_queue);
 	put_device(dev);
 }
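The pattern behind the scsi_prep_fn() hunk above generalises: a callback
reached through an object that outlives its backing device must treat the
back-pointer as optional, because teardown NULLs it. Below is a minimal
userspace sketch of that guard; the type and constant names (scsi_dev,
BLKPREP_OK/KILL) are hypothetical stand-ins, not the kernel's definitions.

	#include <stddef.h>
	#include <stdio.h>

	/* Hypothetical stand-ins for the block/SCSI types involved. */
	struct scsi_dev { int id; };
	struct request_queue { void *queuedata; }; /* NULL after removal */
	struct request { int cmd_type; };

	enum { BLKPREP_OK, BLKPREP_KILL };

	/*
	 * Prep callback in the style of scsi_prep_fn(): the device hangs
	 * off q->queuedata, which __scsi_remove_device() sets to NULL, so
	 * a request racing with removal must be rejected, not dereferenced.
	 */
	static int prep_fn(struct request_queue *q, struct request *req)
	{
		struct scsi_dev *sdev = q->queuedata;

		if (!sdev)
			return BLKPREP_KILL; /* device gone: reject */

		/* ... normal command setup using sdev and req ... */
		(void)req;
		return BLKPREP_OK;
	}

	int main(void)
	{
		struct request_queue q = { NULL }; /* after device removal */
		struct request r = { 0 };

		printf("%s\n", prep_fn(&q, &r) == BLKPREP_KILL ? "killed" : "ok");
		return 0;
	}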
commit 3326c784c9f492e988617d93f647ae0cfd4c8d09
Author: Jiri Pirko
Date:   Wed Jul 20 04:54:38 2011 +0000

    forcedeth: do vlan cleanup

    - unify vlan and nonvlan rx path
    - kill np->vlangrp and nv_vlan_rx_register
    - allow to turn on/off rx vlan accel via ethtool (set_features)

    Signed-off-by: Jiri Pirko
    Signed-off-by: David S. Miller

diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c
index 537b695..e64cd9c 100644
--- a/drivers/net/forcedeth.c
+++ b/drivers/net/forcedeth.c
@@ -820,9 +820,6 @@ struct fe_priv {
 	struct nv_skb_map *tx_end_flip;
 	int tx_stop;

-	/* vlan fields */
-	struct vlan_group *vlangrp;
-
 	/* msi/msi-x fields */
 	u32 msi_flags;
 	struct msix_entry msi_x_entry[NV_MSI_X_MAX_VECTORS];
@@ -2766,17 +2763,13 @@ static int nv_rx_process_optimized(struct net_device *dev, int limit)
 			skb->protocol = eth_type_trans(skb, dev);
 			prefetch(skb->data);

-			if (likely(!np->vlangrp)) {
-				napi_gro_receive(&np->napi, skb);
-			} else {
-				vlanflags = le32_to_cpu(np->get_rx.ex->buflow);
-				if (vlanflags & NV_RX3_VLAN_TAG_PRESENT) {
-					vlan_gro_receive(&np->napi, np->vlangrp,
-							 vlanflags & NV_RX3_VLAN_TAG_MASK, skb);
-				} else {
-					napi_gro_receive(&np->napi, skb);
-				}
+			vlanflags = le32_to_cpu(np->get_rx.ex->buflow);
+			if (vlanflags & NV_RX3_VLAN_TAG_PRESENT) {
+				u16 vid = vlanflags & NV_RX3_VLAN_TAG_MASK;
+
+				__vlan_hwaccel_put_tag(skb, vid);
 			}
+			napi_gro_receive(&np->napi, skb);

 			dev->stats.rx_packets++;
 			dev->stats.rx_bytes += len;
@@ -4484,6 +4477,27 @@ static u32 nv_fix_features(struct net_device *dev, u32 features)
 	return features;
 }

+static void nv_vlan_mode(struct net_device *dev, u32 features)
+{
+	struct fe_priv *np = get_nvpriv(dev);
+
+	spin_lock_irq(&np->lock);
+
+	if (features & NETIF_F_HW_VLAN_RX)
+		np->txrxctl_bits |= NVREG_TXRXCTL_VLANSTRIP;
+	else
+		np->txrxctl_bits &= ~NVREG_TXRXCTL_VLANSTRIP;
+
+	if (features & NETIF_F_HW_VLAN_TX)
+		np->txrxctl_bits |= NVREG_TXRXCTL_VLANINS;
+	else
+		np->txrxctl_bits &= ~NVREG_TXRXCTL_VLANINS;
+
+	writel(np->txrxctl_bits, get_hwbase(dev) + NvRegTxRxControl);
+
+	spin_unlock_irq(&np->lock);
+}
+
 static int nv_set_features(struct net_device *dev, u32 features)
 {
 	struct fe_priv *np = netdev_priv(dev);
@@ -4504,6 +4518,9 @@ static int nv_set_features(struct net_device *dev, u32 features)
 		spin_unlock_irq(&np->lock);
 	}

+	if (changed & (NETIF_F_HW_VLAN_TX | NETIF_F_HW_VLAN_RX))
+		nv_vlan_mode(dev, features);
+
 	return 0;
 }

@@ -4879,29 +4896,6 @@ static const struct ethtool_ops ops = {
 	.self_test = nv_self_test,
 };

-static void nv_vlan_rx_register(struct net_device *dev, struct vlan_group *grp)
-{
-	struct fe_priv *np = get_nvpriv(dev);
-
-	spin_lock_irq(&np->lock);
-
-	/* save vlan group */
-	np->vlangrp = grp;
-
-	if (grp) {
-		/* enable vlan on MAC */
-		np->txrxctl_bits |= NVREG_TXRXCTL_VLANSTRIP | NVREG_TXRXCTL_VLANINS;
-	} else {
-		/* disable vlan on MAC */
-		np->txrxctl_bits &= ~NVREG_TXRXCTL_VLANSTRIP;
-		np->txrxctl_bits &= ~NVREG_TXRXCTL_VLANINS;
-	}
-
-	writel(np->txrxctl_bits, get_hwbase(dev) + NvRegTxRxControl);
-
-	spin_unlock_irq(&np->lock);
-}
-
 /* The mgmt unit and driver use a semaphore to access the phy during init */
 static int nv_mgmt_acquire_sema(struct net_device *dev)
 {
@@ -5208,7 +5202,6 @@ static const struct net_device_ops nv_netdev_ops = {
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= nv_set_mac_address,
 	.ndo_set_multicast_list	= nv_set_multicast,
-	.ndo_vlan_rx_register	= nv_vlan_rx_register,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= nv_poll_controller,
 #endif
@@ -5226,7 +5219,6 @@ static const struct net_device_ops nv_netdev_ops_optimized = {
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= nv_set_mac_address,
 	.ndo_set_multicast_list	= nv_set_multicast,
-	.ndo_vlan_rx_register	= nv_vlan_rx_register,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= nv_poll_controller,
 #endif
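The nv_set_features()/nv_vlan_mode() split above is the standard
set_features shape: compute which feature bits actually changed, and only
touch the hardware for those. A minimal userspace sketch follows, assuming
made-up feature and register bits standing in for NETIF_F_HW_VLAN_RX/TX
and NVREG_TXRXCTL_VLANSTRIP/VLANINS:

	#include <stdint.h>
	#include <stdio.h>

	/* Hypothetical feature bits mirroring NETIF_F_HW_VLAN_RX/TX. */
	#define F_VLAN_RX (1u << 0)
	#define F_VLAN_TX (1u << 1)

	struct nic {
		uint32_t features; /* currently active features */
		uint32_t hw_ctl;   /* shadow of the MAC control register */
	};

	/* Program the MAC from the requested features, like nv_vlan_mode(). */
	static void set_vlan_mode(struct nic *nic, uint32_t features)
	{
		if (features & F_VLAN_RX)
			nic->hw_ctl |= 0x10;  /* strip tags on rx (made-up bit) */
		else
			nic->hw_ctl &= ~0x10;
		if (features & F_VLAN_TX)
			nic->hw_ctl |= 0x20;  /* insert tags on tx (made-up bit) */
		else
			nic->hw_ctl &= ~0x20;
		/* a real driver would writel() the shadow to the device here */
	}

	static int set_features(struct nic *nic, uint32_t features)
	{
		uint32_t changed = nic->features ^ features;

		/* Only touch the hardware for bits that actually changed. */
		if (changed & (F_VLAN_RX | F_VLAN_TX))
			set_vlan_mode(nic, features);

		nic->features = features;
		return 0;
	}

	int main(void)
	{
		struct nic nic = { 0, 0 };

		set_features(&nic, F_VLAN_RX);
		printf("ctl=0x%x\n", nic.hw_ctl); /* prints ctl=0x10 */
		return 0;
	}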
commit 0891b0e08937aaec2c4734acb94c5ff8042313bb
Author: Jiri Pirko
Date:   Tue Jul 26 10:19:28 2011 +0000

    forcedeth: fix vlans

    For some reason, when rxaccel is disabled, NV_RX3_VLAN_TAG_PRESENT is
    still set and some pseudorandom vids appear. So check for
    NETIF_F_HW_VLAN_RX as well. Also set hw_features correctly and set
    the vlan mode on probe.

    Signed-off-by: Jiri Pirko
    Signed-off-by: David S. Miller

diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c
index e64cd9c..e55df30 100644
--- a/drivers/net/forcedeth.c
+++ b/drivers/net/forcedeth.c
@@ -2764,7 +2764,14 @@ static int nv_rx_process_optimized(struct net_device *dev, int limit)
 			prefetch(skb->data);

 			vlanflags = le32_to_cpu(np->get_rx.ex->buflow);
-			if (vlanflags & NV_RX3_VLAN_TAG_PRESENT) {
+
+			/*
+			 * There's a need to check for NETIF_F_HW_VLAN_RX here.
+			 * Even if vlan rx accel is disabled,
+			 * NV_RX3_VLAN_TAG_PRESENT is pseudo randomly set.
+			 */
+			if (dev->features & NETIF_F_HW_VLAN_RX &&
+			    vlanflags & NV_RX3_VLAN_TAG_PRESENT) {
 				u16 vid = vlanflags & NV_RX3_VLAN_TAG_MASK;

 				__vlan_hwaccel_put_tag(skb, vid);
@@ -5331,15 +5338,16 @@ static int __devinit nv_probe(struct pci_dev *pci_dev, const struct pci_device_i
 		np->txrxctl_bits |= NVREG_TXRXCTL_RXCHECK;
 		dev->hw_features |= NETIF_F_IP_CSUM | NETIF_F_SG |
 			NETIF_F_TSO | NETIF_F_RXCSUM;
-		dev->features |= dev->hw_features;
 	}

 	np->vlanctl_bits = 0;
 	if (id->driver_data & DEV_HAS_VLAN) {
 		np->vlanctl_bits = NVREG_VLANCONTROL_ENABLE;
-		dev->features |= NETIF_F_HW_VLAN_RX | NETIF_F_HW_VLAN_TX;
+		dev->hw_features |= NETIF_F_HW_VLAN_RX | NETIF_F_HW_VLAN_TX;
 	}

+	dev->features |= dev->hw_features;
+
 	np->pause_flags = NV_PAUSEFRAME_RX_CAPABLE | NV_PAUSEFRAME_RX_REQ | NV_PAUSEFRAME_AUTONEG;
 	if ((id->driver_data & DEV_HAS_PAUSEFRAME_TX_V1) ||
 	    (id->driver_data & DEV_HAS_PAUSEFRAME_TX_V2) ||
@@ -5607,6 +5615,8 @@ static int __devinit nv_probe(struct pci_dev *pci_dev, const struct pci_device_i
 		goto out_error;
 	}

+	nv_vlan_mode(dev, dev->features);
+
 	netif_carrier_off(dev);

 	dev_info(&pci_dev->dev, "ifname %s, PHY OUI 0x%x @ %d, addr %pM\n",

--- linux-3.0/scripts/kconfig/lxdialog/check-lxdialog.sh~	2011-07-22 04:17:23.000000000 +0200
+++ linux-3.0/scripts/kconfig/lxdialog/check-lxdialog.sh	2011-08-25 21:26:04.799150642 +0200
@@ -9,6 +9,12 @@
 		$cc -print-file-name=lib${lib}.${ext} | grep -q /
 		if [ $? -eq 0 ]; then
 			echo "-l${lib}"
+			for libt in tinfow tinfo ; do
+				$cc -print-file-name=lib${libt}.${ext} | grep -q /
+				if [ $? -eq 0 ]; then
+					echo "-l${libt}"
+				fi
+			done
 			exit
 		fi
 	done

commit 37b652ec6445be99d0193047d1eda129a1a315d3
Author: Dave Chinner
Date:   Thu Aug 25 07:17:01 2011 +0000

    xfs: don't serialise direct IO reads on page cache checks

    There is no need to grab the i_mutex or the IO lock in exclusive
    mode if we don't need to invalidate the page cache. Taking these
    locks on every direct IO effectively serialises them, as taking the
    IO lock in exclusive mode has to wait for all shared holders to drop
    the lock. That only happens when IO is complete, so effectively it
    prevents dispatch of concurrent direct IO reads to the same inode.

    Fix this by taking the IO lock shared to check the page cache state,
    and only then drop it and take the IO lock exclusively if there is
    work to be done. Hence for the normal direct IO case, no exclusive
    locking will occur.

    Signed-off-by: Dave Chinner
    Tested-by: Joern Engel
    Reviewed-by: Christoph Hellwig
    Signed-off-by: Alex Elder

diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 7f7b424..8fd4a07 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -317,7 +317,19 @@ xfs_file_aio_read(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;

-	if (unlikely(ioflags & IO_ISDIRECT)) {
+	/*
+	 * Locking is a bit tricky here. If we take an exclusive lock
+	 * for direct IO, we effectively serialise all new concurrent
+	 * read IO to this file and block it behind IO that is currently in
+	 * progress because IO in progress holds the IO lock shared. We only
+	 * need to hold the lock exclusive to blow away the page cache, so
+	 * only take lock exclusively if the page cache needs invalidation.
+	 * This allows the normal direct IO case of no page cache pages to
+	 * proceed concurrently without serialisation.
+	 */
+	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+	if ((ioflags & IO_ISDIRECT) && inode->i_mapping->nrpages) {
+		xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 		xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);

 		if (inode->i_mapping->nrpages) {
@@ -330,8 +342,7 @@ xfs_file_aio_read(
 			}
 		}
 		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
-	} else
-		xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+	}

 	trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
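The locking scheme this commit describes is a general double-checked
upgrade: test state under a shared lock, and only if work is needed drop
it, take the lock exclusive, and recheck (the state can change while no
lock is held). Below is a userspace sketch of the same idea with a pthreads
rwlock; cache_is_clean()/invalidate_cache() are hypothetical stand-ins for
the nrpages check and invalidation, and pthreads lacks the atomic demote
that xfs_rw_ilock_demote() provides, so the sketch admits one extra race:

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	static pthread_rwlock_t io_lock = PTHREAD_RWLOCK_INITIALIZER;

	/* Hypothetical stand-ins for the page-cache state and invalidation. */
	static bool cache_dirty = true;
	static bool cache_is_clean(void)  { return !cache_dirty; }
	static void invalidate_cache(void) { cache_dirty = false; }

	static void direct_io_read(void)
	{
		pthread_rwlock_rdlock(&io_lock);
		if (!cache_is_clean()) {
			/*
			 * Work is needed: drop the shared lock and take it
			 * exclusive. The state may change while no lock is
			 * held, so recheck after the upgrade, exactly as the
			 * patch rechecks nrpages.
			 */
			pthread_rwlock_unlock(&io_lock);
			pthread_rwlock_wrlock(&io_lock);
			if (!cache_is_clean())
				invalidate_cache();
			/* no demote in pthreads: drop and re-acquire shared */
			pthread_rwlock_unlock(&io_lock);
			pthread_rwlock_rdlock(&io_lock);
		}

		/* ... issue the read while holding the lock shared ... */

		pthread_rwlock_unlock(&io_lock);
	}

	int main(void)
	{
		direct_io_read();
		printf("cache clean: %d\n", cache_is_clean());
		return 0;
	}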
Start the periodic sync workers only after we have finished xfs_mountfs
and thus fully set up the filesystem structures. Without this we can call
into xfs_qm_sync before the quotainfo structure is set up if the mount
takes unusually long, and probably hit other incomplete states as well.

Also clean up the xfs_fs_fill_super error path by using consistent label
names, and removing an impossible-to-reach case.

Signed-off-by: Christoph Hellwig
Reported-by: Arkadiusz Miskiewicz
Reviewed-by: Alex Elder

diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index a1a881e..3ebb458 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1412,37 +1412,35 @@ xfs_fs_fill_super(
 	sb->s_time_gran = 1;
 	set_posix_acl_flag(sb);

-	error = xfs_syncd_init(mp);
-	if (error)
-		goto out_filestream_unmount;
-
 	xfs_inode_shrinker_register(mp);

 	error = xfs_mountfs(mp);
 	if (error)
-		goto out_syncd_stop;
+		goto out_filestream_unmount;
+
+	error = xfs_syncd_init(mp);
+	if (error)
+		goto out_unmount;

 	root = igrab(VFS_I(mp->m_rootip));
 	if (!root) {
 		error = ENOENT;
-		goto fail_unmount;
+		goto out_syncd_stop;
 	}
 	if (is_bad_inode(root)) {
 		error = EINVAL;
-		goto fail_vnrele;
+		goto out_syncd_stop;
 	}
 	sb->s_root = d_alloc_root(root);
 	if (!sb->s_root) {
 		error = ENOMEM;
-		goto fail_vnrele;
+		goto out_iput;
 	}

 	return 0;

- out_syncd_stop:
-	xfs_inode_shrinker_unregister(mp);
-	xfs_syncd_stop(mp);
 out_filestream_unmount:
+	xfs_inode_shrinker_unregister(mp);
 	xfs_filestream_unmount(mp);
 out_free_sb:
 	xfs_freesb(mp);
@@ -1456,17 +1454,12 @@ xfs_fs_fill_super(
 out:
 	return -error;

- fail_vnrele:
-	if (sb->s_root) {
-		dput(sb->s_root);
-		sb->s_root = NULL;
-	} else {
-		iput(root);
-	}
-
- fail_unmount:
-	xfs_inode_shrinker_unregister(mp);
+ out_iput:
+	iput(root);
+ out_syncd_stop:
 	xfs_syncd_stop(mp);
+ out_unmount:
+	xfs_inode_shrinker_unregister(mp);

 	/*
	 * Blow away any referenced inode in the filestreams cache.
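The label renaming above follows the usual kernel unwind idiom: error
labels appear in reverse order of setup, so each failure jumps to the
point that undoes exactly what has already succeeded, and moving a setup
step (here, starting syncd after xfs_mountfs) just moves its label. A
minimal sketch with hypothetical setup/teardown pairs:

	#include <stdlib.h>

	/* Hypothetical setup/teardown pairs standing in for the mount
	 * steps (shrinker registration, xfs_mountfs, xfs_syncd_init). */
	static int  setup_shrinker(void) { return 0; }
	static void undo_shrinker(void)  { }
	static int  setup_mount(void)    { return 0; }
	static void undo_mount(void)     { }
	static int  setup_syncd(void)    { return 0; }

	static int fill_super(void)
	{
		int error;

		error = setup_shrinker();
		if (error)
			goto out;
		error = setup_mount();
		if (error)
			goto out_shrinker;
		/* Start background workers only once the structures they
		 * touch are fully set up -- the ordering point above. */
		error = setup_syncd();
		if (error)
			goto out_mount;

		return 0;

		/* Unwind labels run in reverse order of setup: each failure
		 * enters at the label undoing the last successful step. */
	out_mount:
		undo_mount();
	out_shrinker:
		undo_shrinker();
	out:
		return error;
	}

	int main(void)
	{
		return fill_super() ? EXIT_FAILURE : EXIT_SUCCESS;
	}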
From: Dave Chinner

commit 1d8c95a363bf8cd4d4182dd19c01693b635311c2 upstream

xfs: use a cursor for bulk AIL insertion

Delayed logging can insert tens of thousands of log items into the AIL
at the same LSN. When the committing of log commit records occurs, we
can get insertions occurring at an LSN that is not at the end of the
AIL. If there are thousands of items in the AIL on the tail LSN, each
insertion has to walk the AIL to find the correct place to insert the
new item into the AIL. This can consume large amounts of CPU time and
block other operations from occurring while the traversals are in
progress.

To avoid this repeated walk, use an AIL cursor to record where we
should be inserting the new items into the AIL without having to
repeat the walk. The cursor infrastructure already provides this
functionality for push walks, so this is a simple extension of
existing code. While this will not avoid the initial walk, it will
avoid repeating it tens of thousands of times during a single
checkpoint commit.

This version includes logic improvements from Christoph Hellwig.

Signed-off-by: Dave Chinner
Reviewed-by: Christoph Hellwig
Signed-off-by: Alex Elder

diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index c83f63b..efc147f 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1426,6 +1426,7 @@ xfs_trans_committed(
 static inline void
 xfs_log_item_batch_insert(
 	struct xfs_ail		*ailp,
+	struct xfs_ail_cursor	*cur,
 	struct xfs_log_item	**log_items,
 	int			nr_items,
 	xfs_lsn_t		commit_lsn)
@@ -1434,7 +1435,7 @@ xfs_log_item_batch_insert(
 	spin_lock(&ailp->xa_lock);
 	/* xfs_trans_ail_update_bulk drops ailp->xa_lock */
-	xfs_trans_ail_update_bulk(ailp, log_items, nr_items, commit_lsn);
+	xfs_trans_ail_update_bulk(ailp, cur, log_items, nr_items, commit_lsn);

 	for (i = 0; i < nr_items; i++)
 		IOP_UNPIN(log_items[i], 0);
@@ -1452,6 +1453,13 @@ xfs_log_item_batch_insert(
  * as an iclog write error even though we haven't started any IO yet. Hence in
  * this case all we need to do is IOP_COMMITTED processing, followed by an
  * IOP_UNPIN(aborted) call.
+ *
+ * The AIL cursor is used to optimise the insert process. If commit_lsn is not
+ * at the end of the AIL, the insert cursor avoids the need to walk
+ * the AIL to find the insertion point on every xfs_log_item_batch_insert()
+ * call. This saves a lot of needless list walking and is a net win, even
+ * though it slightly increases the amount of AIL lock traffic to set it up
+ * and tear it down.
  */
 void
 xfs_trans_committed_bulk(
@@ -1463,8 +1471,13 @@ xfs_trans_committed_bulk(
 #define LOG_ITEM_BATCH_SIZE	32
 	struct xfs_log_item	*log_items[LOG_ITEM_BATCH_SIZE];
 	struct xfs_log_vec	*lv;
+	struct xfs_ail_cursor	cur;
 	int			i = 0;

+	spin_lock(&ailp->xa_lock);
+	xfs_trans_ail_cursor_last(ailp, &cur, commit_lsn);
+	spin_unlock(&ailp->xa_lock);
+
 	/* unpin all the log items */
 	for (lv = log_vector; lv; lv = lv->lv_next ) {
 		struct xfs_log_item	*lip = lv->lv_item;
@@ -1493,7 +1506,9 @@ xfs_trans_committed_bulk(
 			/*
 			 * Not a bulk update option due to unusual item_lsn.
 			 * Push into AIL immediately, rechecking the lsn once
-			 * we have the ail lock. Then unpin the item.
+			 * we have the ail lock. Then unpin the item. This does
+			 * not affect the AIL cursor the bulk insert path is
+			 * using.
 			 */
 			spin_lock(&ailp->xa_lock);
 			if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0)
@@ -1507,7 +1522,7 @@ xfs_trans_committed_bulk(
 		/* Item is a candidate for bulk AIL insert.  */
 		log_items[i++] = lv->lv_item;
 		if (i >= LOG_ITEM_BATCH_SIZE) {
-			xfs_log_item_batch_insert(ailp, log_items,
+			xfs_log_item_batch_insert(ailp, &cur, log_items,
 					LOG_ITEM_BATCH_SIZE, commit_lsn);
 			i = 0;
 		}
@@ -1515,7 +1530,11 @@ xfs_trans_committed_bulk(

 	/* make sure we insert the remainder! */
 	if (i)
-		xfs_log_item_batch_insert(ailp, log_items, i, commit_lsn);
+		xfs_log_item_batch_insert(ailp, &cur, log_items, i, commit_lsn);
+
+	spin_lock(&ailp->xa_lock);
+	xfs_trans_ail_cursor_done(ailp, &cur);
+	spin_unlock(&ailp->xa_lock);
 }

 /*
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 5fc2380..9a69dc0 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -272,9 +272,9 @@ xfs_trans_ail_cursor_clear(
 }

 /*
- * Return the item in the AIL with the current lsn.
- * Return the current tree generation number for use
- * in calls to xfs_trans_next_ail().
+ * Initialise the cursor to the first item in the AIL with the given @lsn.
+ * This searches the list from lowest LSN to highest. Pass a @lsn of zero
+ * to initialise the cursor to the first item in the AIL.
 */
 xfs_log_item_t *
 xfs_trans_ail_cursor_first(
@@ -300,31 +300,97 @@ out:
 }

 /*
- * splice the log item list into the AIL at the given LSN.
+ * Initialise the cursor to the last item in the AIL with the given @lsn.
+ * This searches the list from highest LSN to lowest. If there is no item
+ * with the value of @lsn, then it sets the cursor to the last item with an
+ * LSN lower than @lsn.
+ */
+static struct xfs_log_item *
+__xfs_trans_ail_cursor_last(
+	struct xfs_ail		*ailp,
+	xfs_lsn_t		lsn)
+{
+	xfs_log_item_t		*lip;
+
+	list_for_each_entry_reverse(lip, &ailp->xa_ail, li_ail) {
+		if (XFS_LSN_CMP(lip->li_lsn, lsn) <= 0)
+			return lip;
+	}
+	return NULL;
+}
+
+/*
+ * Initialise the cursor to the last item in the AIL with the given @lsn.
+ * This searches the list from highest LSN to lowest.
+ */
+struct xfs_log_item *
+xfs_trans_ail_cursor_last(
+	struct xfs_ail		*ailp,
+	struct xfs_ail_cursor	*cur,
+	xfs_lsn_t		lsn)
+{
+	xfs_trans_ail_cursor_init(ailp, cur);
+	cur->item = __xfs_trans_ail_cursor_last(ailp, lsn);
+	return cur->item;
+}
+
+/*
+ * splice the log item list into the AIL at the given LSN. We splice to the
+ * tail of the given LSN to maintain insert order for push traversals. The
+ * cursor is optional, allowing repeated updates to the same LSN to avoid
+ * repeated traversals.
 */
 static void
 xfs_ail_splice(
-	struct xfs_ail	*ailp,
-	struct list_head *list,
-	xfs_lsn_t	lsn)
+	struct xfs_ail		*ailp,
+	struct xfs_ail_cursor	*cur,
+	struct list_head	*list,
+	xfs_lsn_t		lsn)
 {
-	xfs_log_item_t	*next_lip;
+	struct xfs_log_item	*lip = cur ? cur->item : NULL;
+	struct xfs_log_item	*next_lip;

-	/* If the list is empty, just insert the item. */
-	if (list_empty(&ailp->xa_ail)) {
-		list_splice(list, &ailp->xa_ail);
-		return;
+	/*
+	 * Get a new cursor if we don't have a placeholder or the existing one
+	 * has been invalidated.
+	 */
+	if (!lip || (__psint_t)lip & 1) {
+		lip = __xfs_trans_ail_cursor_last(ailp, lsn);
+
+		if (!lip) {
+			/* The list is empty, so just splice and return. */
+			if (cur)
+				cur->item = NULL;
+			list_splice(list, &ailp->xa_ail);
+			return;
+		}
 	}

-	list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) {
-		if (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0)
-			break;
+	/*
+	 * Our cursor points to the item we want to insert _after_, so we have
+	 * to update the cursor to point to the end of the list we are splicing
+	 * in so that it points to the correct location for the next splice.
+	 * i.e. before the splice
+	 *
+	 *  lsn -> lsn -> lsn + x -> lsn + x ...
+	 *          ^
+	 *          | cursor points here
+	 *
+	 * After the splice we have:
+	 *
+	 *  lsn -> lsn -> lsn -> lsn -> .... -> lsn -> lsn + x -> lsn + x ...
+	 *          ^                            ^
+	 *          | cursor points here         | needs to move here
+	 *
+	 * So we set the cursor to the last item in the list to be spliced
+	 * before we execute the splice, resulting in the cursor pointing to
+	 * the correct item after the splice occurs.
+	 */
+	if (cur) {
+		next_lip = list_entry(list->prev, struct xfs_log_item, li_ail);
+		cur->item = next_lip;
 	}
-
-	ASSERT(&next_lip->li_ail == &ailp->xa_ail ||
-	       XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0);
-
-	list_splice_init(list, &next_lip->li_ail);
+	list_splice(list, &lip->li_ail);
 }

 /*
@@ -645,6 +711,7 @@ xfs_trans_unlocked_item(
 void
 xfs_trans_ail_update_bulk(
 	struct xfs_ail		*ailp,
+	struct xfs_ail_cursor	*cur,
 	struct xfs_log_item	**log_items,
 	int			nr_items,
 	xfs_lsn_t		lsn) __releases(ailp->xa_lock)
@@ -674,7 +741,7 @@ xfs_trans_ail_update_bulk(
 		list_add(&lip->li_ail, &tmp);
 	}

-	xfs_ail_splice(ailp, &tmp, lsn);
+	xfs_ail_splice(ailp, cur, &tmp, lsn);

 	if (!mlip_changed) {
 		spin_unlock(&ailp->xa_lock);
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 6b164e9..c0cb408 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -82,6 +82,7 @@ struct xfs_ail {
 extern struct workqueue_struct	*xfs_ail_wq;	/* AIL workqueue */

 void	xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
+				struct xfs_ail_cursor *cur,
 				struct xfs_log_item **log_items, int nr_items,
 				xfs_lsn_t lsn) __releases(ailp->xa_lock);
 static inline void
@@ -90,7 +91,7 @@ xfs_trans_ail_update(
 	struct xfs_log_item	*lip,
 	xfs_lsn_t		lsn) __releases(ailp->xa_lock)
 {
-	xfs_trans_ail_update_bulk(ailp, &lip, 1, lsn);
+	xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn);
 }

 void	xfs_trans_ail_delete_bulk(struct xfs_ail *ailp,
@@ -111,10 +112,13 @@ xfs_lsn_t		xfs_ail_min_lsn(struct xfs_ail *ailp);
 void			xfs_trans_unlocked_item(struct xfs_ail *,
 					xfs_log_item_t *);

-struct xfs_log_item	*xfs_trans_ail_cursor_first(struct xfs_ail *ailp,
+struct xfs_log_item *	xfs_trans_ail_cursor_first(struct xfs_ail *ailp,
 					struct xfs_ail_cursor *cur,
 					xfs_lsn_t lsn);
-struct xfs_log_item	*xfs_trans_ail_cursor_next(struct xfs_ail *ailp,
+struct xfs_log_item *	xfs_trans_ail_cursor_last(struct xfs_ail *ailp,
+					struct xfs_ail_cursor *cur,
+					xfs_lsn_t lsn);
+struct xfs_log_item *	xfs_trans_ail_cursor_next(struct xfs_ail *ailp,
 					struct xfs_ail_cursor *cur);
 void			xfs_trans_ail_cursor_done(struct xfs_ail *ailp,
 					struct xfs_ail_cursor *cur);

commit bc6e588a8971aa74c02e42db4d6e0248679f3738 upstream

If an item was locked, we should not update xa_last_pushed_lsn and
thereby skip it when restarting the AIL scan, as we need to be able to
lock and write it out as soon as possible. Otherwise heavy lock
contention might starve AIL pushing too easily, especially given the
larger backoff once we moved xa_last_pushed_lsn all the way to the
target lsn.

Signed-off-by: Christoph Hellwig
Reported-by: Stefan Priebe
Tested-by: Stefan Priebe

Index: xfs/fs/xfs/xfs_trans_ail.c
===================================================================
--- xfs.orig/fs/xfs/xfs_trans_ail.c	2011-10-14 14:42:03.004395373 +0200
+++ xfs/fs/xfs/xfs_trans_ail.c	2011-10-14 14:42:22.687898198 +0200
@@ -491,7 +491,6 @@ xfs_ail_worker(

 		case XFS_ITEM_LOCKED:
 			XFS_STATS_INC(xs_push_ail_locked);
-			ailp->xa_last_pushed_lsn = lsn;
 			stuck++;
 			break;
commit 17b38471c3c07a49f0bbc2ecc2e92050c164e226 upstream

We need to check for pinned buffers even in .iop_pushbuf given that
inode items flush into the same buffers that may be pinned directly
due to operations on the unlinked inode list operating directly on
buffers. To do this add a return value to .iop_pushbuf that tells the
AIL push about this and use the existing log force mechanisms to unpin
it.

Signed-off-by: Christoph Hellwig
Reported-by: Stefan Priebe
Tested-by: Stefan Priebe

Index: xfs/fs/xfs/quota/xfs_dquot_item.c
===================================================================
--- xfs.orig/fs/xfs/quota/xfs_dquot_item.c	2011-10-14 14:41:41.036231498 +0200
+++ xfs/fs/xfs/quota/xfs_dquot_item.c	2011-10-14 14:44:09.276394842 +0200
@@ -183,13 +183,14 @@ xfs_qm_dqunpin_wait(
 * search the buffer cache can be a time consuming thing, and AIL lock is a
 * spinlock.
 */
-STATIC void
+STATIC bool
 xfs_qm_dquot_logitem_pushbuf(
 	struct xfs_log_item	*lip)
 {
 	struct xfs_dq_logitem	*qlip = DQUOT_ITEM(lip);
 	struct xfs_dquot	*dqp = qlip->qli_dquot;
 	struct xfs_buf		*bp;
+	bool			ret = true;

 	ASSERT(XFS_DQ_IS_LOCKED(dqp));

@@ -201,17 +202,20 @@ xfs_qm_dquot_logitem_pushbuf(
 	if (completion_done(&dqp->q_flush) ||
 	    !(lip->li_flags & XFS_LI_IN_AIL)) {
 		xfs_dqunlock(dqp);
-		return;
+		return true;
 	}

 	bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno,
 			dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
 	xfs_dqunlock(dqp);
 	if (!bp)
-		return;
+		return true;
 	if (XFS_BUF_ISDELAYWRITE(bp))
 		xfs_buf_delwri_promote(bp);
+	if (XFS_BUF_ISPINNED(bp))
+		ret = false;
 	xfs_buf_relse(bp);
+	return ret;
 }

 /*
Index: xfs/fs/xfs/xfs_buf_item.c
===================================================================
--- xfs.orig/fs/xfs/xfs_buf_item.c	2011-10-14 14:41:41.000000000 +0200
+++ xfs/fs/xfs/xfs_buf_item.c	2011-10-14 14:44:24.367895813 +0200
@@ -632,7 +632,7 @@ xfs_buf_item_push(
 * the xfsbufd to get this buffer written. We have to unlock the buffer
 * to allow the xfsbufd to write it, too.
 */
-STATIC void
+STATIC bool
 xfs_buf_item_pushbuf(
 	struct xfs_log_item	*lip)
 {
@@ -646,6 +646,7 @@ xfs_buf_item_pushbuf(

 	xfs_buf_delwri_promote(bp);
 	xfs_buf_relse(bp);
+	return true;
 }

 STATIC void
Index: xfs/fs/xfs/xfs_inode_item.c
===================================================================
--- xfs.orig/fs/xfs/xfs_inode_item.c	2011-10-14 14:41:41.000000000 +0200
+++ xfs/fs/xfs/xfs_inode_item.c	2011-10-14 14:44:19.323950541 +0200
@@ -713,13 +713,14 @@ xfs_inode_item_committed(
 * marked delayed write. If that's the case, we'll promote it and that will
 * allow the caller to write the buffer by triggering the xfsbufd to run.
 */
-STATIC void
+STATIC bool
 xfs_inode_item_pushbuf(
 	struct xfs_log_item	*lip)
 {
 	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
 	struct xfs_inode	*ip = iip->ili_inode;
 	struct xfs_buf		*bp;
+	bool			ret = true;

 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));

@@ -730,7 +731,7 @@ xfs_inode_item_pushbuf(
 	if (completion_done(&ip->i_flush) ||
 	    !(lip->li_flags & XFS_LI_IN_AIL)) {
 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
-		return;
+		return true;
 	}

 	bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno,
@@ -738,10 +739,13 @@ xfs_inode_item_pushbuf(
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 	if (!bp)
-		return;
+		return true;
 	if (XFS_BUF_ISDELAYWRITE(bp))
 		xfs_buf_delwri_promote(bp);
+	if (XFS_BUF_ISPINNED(bp))
+		ret = false;
 	xfs_buf_relse(bp);
+	return ret;
 }

 /*
Index: xfs/fs/xfs/xfs_trans.h
===================================================================
--- xfs.orig/fs/xfs/xfs_trans.h	2011-10-14 14:41:41.000000000 +0200
+++ xfs/fs/xfs/xfs_trans.h	2011-10-14 14:43:45.308394072 +0200
@@ -350,7 +350,7 @@ typedef struct xfs_item_ops {
 	void (*iop_unlock)(xfs_log_item_t *);
 	xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
 	void (*iop_push)(xfs_log_item_t *);
-	void (*iop_pushbuf)(xfs_log_item_t *);
+	bool (*iop_pushbuf)(xfs_log_item_t *);
 	void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
 } xfs_item_ops_t;

Index: xfs/fs/xfs/xfs_trans_ail.c
===================================================================
--- xfs.orig/fs/xfs/xfs_trans_ail.c	2011-10-14 14:42:22.000000000 +0200
+++ xfs/fs/xfs/xfs_trans_ail.c	2011-10-14 14:43:45.316393949 +0200
@@ -478,8 +478,13 @@ xfs_ail_worker(

 		case XFS_ITEM_PUSHBUF:
 			XFS_STATS_INC(xs_push_ail_pushbuf);
-			IOP_PUSHBUF(lip);
-			ailp->xa_last_pushed_lsn = lsn;
+
+			if (!IOP_PUSHBUF(lip)) {
+				stuck++;
+				flush_log = 1;
+			} else {
+				ailp->xa_last_pushed_lsn = lsn;
+			}
 			push_xfsbufd = 1;
 			break;

commit 0030807c66f058230bcb20d2573bcaf28852e804 upstream

Currently we have a few issues with the way the workqueue code is used
to implement AIL pushing:

 - it accidentally uses the same workqueue as the syncer action, and
   thus can be prevented from running if there are enough sync actions
   active in the system.
 - it doesn't use the HIGHPRI flag to queue at the head of the queue
   of work items

At this point I'm not confident enough in getting all the workqueue
flags and tweaks right to provide a perfectly reliable execution
context for AIL pushing, which is the most important piece in XFS to
make forward progress when the log fills.

Revert back to using a kthread per filesystem, which fixes all the
above issues at the cost of having a task struct and stack around for
each mounted filesystem. In addition this also gives us much better
ways to diagnose any issues involving hung AIL pushing, and removes a
small amount of code.

Signed-off-by: Christoph Hellwig
Reported-by: Stefan Priebe
Tested-by: Stefan Priebe

Index: xfs/fs/xfs/xfs_trans_ail.c
===================================================================
--- xfs.orig/fs/xfs/xfs_trans_ail.c	2011-10-14 14:43:45.316393949 +0200
+++ xfs/fs/xfs/xfs_trans_ail.c	2011-10-14 14:45:11.937395278 +0200
@@ -28,8 +28,6 @@
 #include "xfs_trans_priv.h"
 #include "xfs_error.h"

-struct workqueue_struct *xfs_ail_wq;	/* AIL workqueue */
-
 #ifdef DEBUG
 /*
 * Check that the list is sorted as it should be.
@@ -406,16 +404,10 @@ xfs_ail_delete(
 	xfs_trans_ail_cursor_clear(ailp, lip);
 }

-/*
- * xfs_ail_worker does the work of pushing on the AIL. It will requeue itself
- * to run at a later time if there is more work to do to complete the push.
- */
-STATIC void
-xfs_ail_worker(
-	struct work_struct *work)
+static long
+xfsaild_push(
+	struct xfs_ail		*ailp)
 {
-	struct xfs_ail	*ailp = container_of(to_delayed_work(work),
-					struct xfs_ail, xa_work);
 	xfs_mount_t		*mp = ailp->xa_mount;
 	struct xfs_ail_cursor	*cur = &ailp->xa_cursors;
 	xfs_log_item_t		*lip;
@@ -556,20 +548,6 @@ out_done:
 		/* We're past our target or empty, so idle */
 		ailp->xa_last_pushed_lsn = 0;

-		/*
-		 * We clear the XFS_AIL_PUSHING_BIT first before checking
-		 * whether the target has changed. If the target has changed,
-		 * this pushes the requeue race directly onto the result of the
-		 * atomic test/set bit, so we are guaranteed that either the
-		 * the pusher that changed the target or ourselves will requeue
-		 * the work (but not both).
-		 */
-		clear_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags);
-		smp_rmb();
-		if (XFS_LSN_CMP(ailp->xa_target, target) == 0 ||
-		    test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags))
-			return;
-
 		tout = 50;
 	} else if (XFS_LSN_CMP(lsn, target) >= 0) {
 		/*
@@ -592,9 +570,30 @@ out_done:
 		tout = 20;
 	}

-	/* There is more to do, requeue us. */
-	queue_delayed_work(xfs_syncd_wq, &ailp->xa_work,
-					msecs_to_jiffies(tout));
+	return tout;
+}
+
+static int
+xfsaild(
+	void	*data)
+{
+	struct xfs_ail	*ailp = data;
+	long		tout = 0;	/* milliseconds */
+
+	while (!kthread_should_stop()) {
+		if (tout && tout <= 20)
+			__set_current_state(TASK_KILLABLE);
+		else
+			__set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(tout ?
+				 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
+
+		try_to_freeze();
+
+		tout = xfsaild_push(ailp);
+	}
+
+	return 0;
 }

 /*
@@ -629,8 +628,9 @@ xfs_ail_push(
 	 */
 	smp_wmb();
 	xfs_trans_ail_copy_lsn(ailp, &ailp->xa_target, &threshold_lsn);
-	if (!test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags))
-		queue_delayed_work(xfs_syncd_wq, &ailp->xa_work, 0);
+	smp_wmb();
+
+	wake_up_process(ailp->xa_task);
 }

 /*
@@ -865,9 +865,18 @@ xfs_trans_ail_init(
 	ailp->xa_mount = mp;
 	INIT_LIST_HEAD(&ailp->xa_ail);
 	spin_lock_init(&ailp->xa_lock);
-	INIT_DELAYED_WORK(&ailp->xa_work, xfs_ail_worker);
+
+	ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
+			ailp->xa_mount->m_fsname);
+	if (IS_ERR(ailp->xa_task))
+		goto out_free_ailp;
+
 	mp->m_ail = ailp;
 	return 0;
+
+out_free_ailp:
+	kmem_free(ailp);
+	return ENOMEM;
 }

 void
@@ -876,6 +885,6 @@ xfs_trans_ail_destroy(
 {
 	struct xfs_ail	*ailp = mp->m_ail;

-	cancel_delayed_work_sync(&ailp->xa_work);
+	kthread_stop(ailp->xa_task);
 	kmem_free(ailp);
 }
Index: xfs/fs/xfs/xfs_trans_priv.h
===================================================================
--- xfs.orig/fs/xfs/xfs_trans_priv.h	2011-10-14 14:42:03.000000000 +0200
+++ xfs/fs/xfs/xfs_trans_priv.h	2011-10-14 14:45:38.191895324 +0200
@@ -64,23 +64,17 @@ struct xfs_ail_cursor {
 */
 struct xfs_ail {
 	struct xfs_mount	*xa_mount;
+	struct task_struct	*xa_task;
 	struct list_head	xa_ail;
 	xfs_lsn_t		xa_target;
 	struct xfs_ail_cursor	xa_cursors;
 	spinlock_t		xa_lock;
-	struct delayed_work	xa_work;
 	xfs_lsn_t		xa_last_pushed_lsn;
-	unsigned long		xa_flags;
 };

-#define XFS_AIL_PUSHING_BIT	0
-
 /*
 * From xfs_trans_ail.c
 */
-
-extern struct workqueue_struct *xfs_ail_wq;	/* AIL workqueue */
-
 void	xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
 				struct xfs_ail_cursor *cur,
 				struct xfs_log_item **log_items, int nr_items,
Index: xfs/fs/xfs/linux-2.6/xfs_linux.h
===================================================================
--- xfs.orig/fs/xfs/linux-2.6/xfs_linux.h	2011-10-14 14:41:41.000000000 +0200
+++ xfs/fs/xfs/linux-2.6/xfs_linux.h	2011-10-14 14:45:11.941411722 +0200
@@ -70,6 +70,8 @@
 #include
 #include
 #include
+#include
+#include
 #include
 #include
Index: xfs/fs/xfs/linux-2.6/xfs_super.c
===================================================================
--- xfs.orig/fs/xfs/linux-2.6/xfs_super.c	2011-10-14 14:46:38.497394866 +0200
+++ xfs/fs/xfs/linux-2.6/xfs_super.c	2011-10-14 14:46:49.047894210 +0200
@@ -1660,24 +1660,13 @@ xfs_init_workqueues(void)
 	 */
 	xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8);
 	if (!xfs_syncd_wq)
-		goto out;
-
-	xfs_ail_wq = alloc_workqueue("xfsail", WQ_CPU_INTENSIVE, 8);
-	if (!xfs_ail_wq)
-		goto out_destroy_syncd;
-
+		return -ENOMEM;
 	return 0;
-
-out_destroy_syncd:
-	destroy_workqueue(xfs_syncd_wq);
-out:
-	return -ENOMEM;
 }

 STATIC void
 xfs_destroy_workqueues(void)
 {
-	destroy_workqueue(xfs_ail_wq);
 	destroy_workqueue(xfs_syncd_wq);
 }

Fixes a possible memory corruption when the link is larger than
MAXPATHLEN and XFS_DEBUG is not enabled. This also removes the S_ISLNK
assert, since the inode mode is checked previously in
xfs_readlink_by_handle() and via VFS.

Signed-off-by: Carlos Maiolino
---
 fs/xfs/xfs_vnodeops.c |   11 ++++++++---
 1 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 51fc429..c3288be 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -123,13 +123,18 @@ xfs_readlink(

 	xfs_ilock(ip, XFS_ILOCK_SHARED);

-	ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFLNK);
-	ASSERT(ip->i_d.di_size <= MAXPATHLEN);
-
 	pathlen = ip->i_d.di_size;
 	if (!pathlen)
 		goto out;

+	if (pathlen > MAXPATHLEN) {
+		xfs_alert(mp, "%s: inode (%llu) symlink length (%d) too long",
+			 __func__, (unsigned long long)ip->i_ino, pathlen);
+		ASSERT(0);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+
 	if (ip->i_df.if_flags & XFS_IFINLINE) {
 		memcpy(link, ip->i_df.if_u1.if_data, pathlen);
 		link[pathlen] = '\0';
--
1.7.6.2
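The xfs_readlink() change illustrates a rule worth stating on its own: a
length field read from disk is untrusted input and must be bounds-checked
against the destination before memcpy(), in production builds and not only
behind ASSERT(). A userspace sketch with a hypothetical on-disk record;
-EUCLEAN stands in for XFS_ERROR(EFSCORRUPTED):

	#include <errno.h>
	#include <stdio.h>
	#include <string.h>

	#define MAXPATHLEN 1024

	/* Hypothetical on-disk record with an untrusted length field. */
	struct disk_symlink {
		unsigned int len;        /* comes from disk: must be validated */
		char data[MAXPATHLEN];
	};

	/*
	 * Copy the link target into 'link', which holds MAXPATHLEN + 1
	 * bytes. The length is checked before the memcpy; a debug build
	 * could also assert, but the check must exist in production too.
	 */
	static int read_symlink(const struct disk_symlink *sl, char *link)
	{
		unsigned int pathlen = sl->len;

		if (pathlen == 0) {
			link[0] = '\0';
			return 0;
		}
		if (pathlen > MAXPATHLEN) {
			fprintf(stderr, "corrupt symlink: length %u > %d\n",
				pathlen, MAXPATHLEN);
			return -EUCLEAN; /* stand-in for EFSCORRUPTED */
		}
		memcpy(link, sl->data, pathlen);
		link[pathlen] = '\0';
		return 0;
	}

	int main(void)
	{
		struct disk_symlink sl = { 5, "helloXXX" };
		char link[MAXPATHLEN + 1];

		if (read_symlink(&sl, link) == 0)
			printf("%s\n", link); /* prints "hello" */
		return 0;
	}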